This is a solution generated for the Kaggle Tabular Playground Series, July 2021. It primarily uses tidy principles in R (tidyverse, tidymodels).
# Fix the RNG seed so splits, tuning grids, and model fits are reproducible.
set.seed(518)
# packages
library(tidyverse)
library(tidymodels)
library(readr)
library(xgboost)
library(chron)
library(vip)
library(fable)
# datasets
# Load the competition data and keep a character copy of the test
# timestamps for the submission file.
train <- readr::read_csv("train.csv")
test <- readr::read_csv("test.csv")
# for submission
test_id <- as.character(test$date_time)
The initial dataset is rather tidy, though we note several key issues that interfere with inference. First, the data have a date-time variable, which will require further manipulation. Second, the test data have quite a different distribution of dates compared to the training data: the training data lasts from March 2010 to December 2010, while the testing data lasts from January 2011 to April 2011. Finally, we are predicting three different outcomes using the same dataset.
# Tidies a raw competition data frame.
#
# Adds date-derived features (month, hour-of-day factor, season factor,
# calendar date, numeric time constant). For training data, returns a
# named list of three data frames -- one per outcome -- each with its
# target column renamed to `outcome`; for test data, returns the single
# augmented data frame.
#
# .data       data frame read from train.csv / test.csv
# is.training logical; TRUE for the training set
tidy <- function(.data, is.training) {
  # Season lookup indexed by month number: Dec/Jan/Feb -> 1,
  # Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
  # FIX: a plain lookup vector is already vectorized; the previous
  # Vectorize()-wrapped scalar if/else helper produced identical values
  # but one call per element.
  season_lookup <- c(1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1)

  # applies date-time transformations
  fix_date <- .data %>%
    dplyr::mutate(date_time = as.POSIXct(date_time),
                  month = lubridate::month(date_time),
                  time = as.factor(lubridate::hour(date_time)),
                  quarter = as.factor(season_lookup[month]),
                  temp_date = as.Date(lubridate::date(date_time)),
                  time_const = as.numeric(date_time))

  # testing data: return the augmented frame directly
  if (!is.training) {
    return(fix_date)
  }

  # training data: one data frame per outcome, with the other two
  # targets dropped and the kept target renamed to `outcome`
  list(
    carbon = data.frame(dplyr::select(fix_date, -c("target_benzene",
                                                   "target_nitrogen_oxides")) %>%
                          dplyr::rename(outcome = "target_carbon_monoxide")),
    benzene = data.frame(dplyr::select(fix_date, -c("target_carbon_monoxide",
                                                    "target_nitrogen_oxides")) %>%
                           dplyr::rename(outcome = "target_benzene")),
    nitrogen = data.frame(dplyr::select(fix_date, -c("target_carbon_monoxide",
                                                     "target_benzene")) %>%
                            dplyr::rename(outcome = "target_nitrogen_oxides"))
  )
}
# Apply the tidying step: `train` becomes a named list of three
# per-outcome data frames, `test` stays a single data frame.
train <- tidy(train, is.training = TRUE)
test <- tidy(test, is.training = FALSE)

# Numeric predictor columns shared by every model.
numeric_predictors <- c(
  "deg_C", "relative_humidity", "absolute_humidity",
  "sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5"
)
Using the above, we convert the date into a POSIXct data type, and from then collect data on the month, time, and quarter of the data. Note that because the training and testing data cover different months, it is impossible to include dummy variables by month in the analysis; however, it would be reasonable to include categorical variables by season of the year. We do so using the “month” variable above. Finally, using the date, we create a simple date-time constant to account for any other unseen features in this variable; methods like trees will automatically discard particularly unuseful predictors.
# correlation plot
cor = train$carbon %>%
dplyr::select(all_of(numeric_predictors)) %>%
corrr::correlate(quiet = TRUE)
cor
## # A tibble: 8 x 9
## term deg_C relative_humidi… absolute_humidi… sensor_1 sensor_2 sensor_3
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 deg_C NA -0.668 0.445 0.0175 0.133 -0.145
## 2 relative… -0.668 NA 0.249 0.0931 -0.0352 -0.102
## 3 absolute… 0.445 0.249 NA 0.106 0.237 -0.485
## 4 sensor_1 0.0175 0.0931 0.106 NA 0.812 -0.592
## 5 sensor_2 0.133 -0.0352 0.237 0.812 NA -0.819
## 6 sensor_3 -0.145 -0.102 -0.485 -0.592 -0.819 NA
## 7 sensor_4 0.308 0.0270 0.567 0.643 0.812 -0.741
## 8 sensor_5 -0.0506 0.126 0.125 0.861 0.863 -0.706
## # … with 2 more variables: sensor_4 <dbl>, sensor_5 <dbl>
# visualize the correlation matrix
cor %>%
  corrr::rplot() + theme_minimal()
Through a simple correlation plot, we see that sensors 2 and 3, 2 and 4, and 2 and 5 are strongly correlated (correlation > 0.8). While we do not yet transform the data to address this, this correlation implies possible use of dimension reduction techniques like PCA. Otherwise, we can assume a boosted trees or neural network approach will handle this correlation reasonably well.
The date-time variable also provides inquiry into what further data wrangling may be needed. To analyze the data, we create time series plots by plotting the average carbon/benzene/nitrogen levels by day as well as the levels by date and time. In particular, for the day plot, we separate by weekend and day of the week to analyze whether these may be particularly important variables.
# Daily time-series plot of the mean outcome, colored either by a
# weekend indicator (weekend = TRUE) or by day of week (weekend = FALSE).
#
# FIX: the two original branches were line-for-line identical except for
# which grouping column was summarized, so they are collapsed into one
# pipeline that selects the coloring column by name. (The plot legend
# title changes cosmetically from the duplicated original; the data
# plotted are identical.)
#
# .data   one of the per-outcome training frames (`outcome`, `temp_date`)
# weekend logical; choose the coloring variable
SimpleTimeSeries <- function(.data, weekend) {
  color_var <- if (weekend) "weekend" else "day"
  .data %>%
    dplyr::mutate(weekend = chron::is.weekend(temp_date),
                  day = lubridate::wday(temp_date)) %>%
    dplyr::group_by(temp_date) %>%
    # mean() of a column constant within the day recovers its value
    dplyr::summarize(mean = mean(outcome),
                     color_val = mean(.data[[color_var]])) %>%
    dplyr::ungroup() %>%
    ggplot2::ggplot(aes(x = temp_date, y = mean, color = as.factor(color_val))) +
    geom_line() +
    geom_point(aes(color = as.factor(color_val))) +
    theme_minimal()
}
# Daily mean plots: weekend coloring then day-of-week coloring, for
# carbon, benzene, and nitrogen in turn.
SimpleTimeSeries(train$carbon, TRUE) +
  labs(title = "Average Carbon Levels by Date, Weekend")
SimpleTimeSeries(train$carbon, FALSE) +
  labs(title = "Average Carbon Levels by Date, Day")
SimpleTimeSeries(train$benzene, TRUE) +
  labs(title = "Average Benzene Levels by Date, Weekend")
SimpleTimeSeries(train$benzene, FALSE) +
  labs(title = "Average Benzene Levels by Date, Day")
SimpleTimeSeries(train$nitrogen, TRUE) +
  labs(title = "Average Nitrogen Levels by Date, Weekend")
SimpleTimeSeries(train$nitrogen, FALSE) +
  labs(title = "Average Nitrogen Levels by Date, Day")
# Append a factor weekend indicator derived from the calendar date.
addWeekend <- function(.data) {
  .data %>%
    dplyr::mutate(weekend = as.factor(chron::is.weekend(temp_date)))
}
# `train` is a list of three per-outcome frames; `test` is a single one.
train <- lapply(train, addWeekend)
test <- addWeekend(test)
# Interactive (plotly) time series of the raw outcome.
#
# FIX: the original accepted a `title` argument but never used it. It
# now defaults to NULL (backward compatible -- existing calls omit it)
# and is applied to the plot when supplied.
#
# .data one of the per-outcome training frames
# title optional plot title
intTimeSeries <- function(.data, title = NULL) {
  temp_plot <- .data %>%
    ggplot2::ggplot(aes(x = date_time, y = outcome)) +
    geom_line() +
    theme_minimal()
  if (!is.null(title)) {
    temp_plot <- temp_plot + labs(title = title)
  }
  plotly::ggplotly(temp_plot)
}
# One interactive plot per outcome (carbon, benzene, nitrogen).
outcome_ts <- lapply(train, intTimeSeries)
outcome_ts[[1]]
outcome_ts[[2]]
outcome_ts[[3]]
As seen above, pollution levels are almost always consistently lower on weekends, suggesting the need for “weekend” as an explanatory variable. However, we see that certain weekdays also have strong downwards trends. A possible explanation for this could be holidays (and we do see much more intense variations in the holiday months of November and December); for now, we include a weekend variable, but not a holiday variable.
Furthermore, we see that the time series for the outcomes seem fairly normal, except for benzene pollution: there are several “bad” data points, where the pollution levels drop suddenly and persistently to 0.1 for some period. We can follow up with further analysis of predictor variables to see if these errors are predictable by the variables. In particular, we will pay close attention to the dates between 8/26/2010 to 8/28/2010 and 12/14/2010 to 12/17/2010.
Thus, we also plot time series for the sensors and other predictors.
# Stack training predictors (minus the outcome) and test predictors,
# tagged with a train/test indicator, for whole-period sensor plots.
all_data = rbind(select(train$carbon, -"outcome") %>% dplyr::mutate(train = 1),
                 test %>% dplyr::mutate(train = 0))
# Plot one predictor column over time.
# NOTE(review): `variable` is passed as a raw vector (e.g. all_data[, i])
# and captured by aes(); this relies on ggplot2 evaluating the vector
# from the calling environment rather than resolving a column name.
plotSensor = function(.data, variable) {
  return(
    ggplot2::ggplot(.data, aes(x = date_time, y = variable)) +
      geom_line() +
      theme_minimal()
  )
}
# Build interactive time-series plots for columns 2..9 (the predictors).
# The list is preallocated; slot 1 stays empty on purpose so the list
# index matches the column index.
time_series <- vector("list", 9)
variables <- colnames(all_data)
for (col in 2:9) {
  sensor_plot <- plotSensor(all_data, all_data[, col]) +
    labs(title = paste("Time Series for", variables[col]))
  time_series[[col]] <- plotly::ggplotly(sensor_plot)
}
# Interactive predictor time series: columns 2..9 of all_data
# (deg_C, relative/absolute humidity, sensors 1-5).
time_series[[2]]
time_series[[3]]
time_series[[4]]
time_series[[5]]
time_series[[6]]
time_series[[7]]
time_series[[8]]
time_series[[9]]
As expected, the sensors have produced many outliers, represented by sudden drops or increases in the data. In fact, through data exploration, we can see that every possible sensor failure in the prediction variables almost always leads to some strange variation in the benzene outcome, but not the carbon/nitrogen outcomes. Furthermore, sensor failures are much more observable in certain predictors than in others, such as absolute_humidity. Finally, we see that sensor failures persist into the testing data, indicating that these failures could be accounted for to possibly improve accuracy.
Auto-identifying these features proves to be quite difficult, given that even though erroneous values produce somewhat extreme values, they are still somewhat reasonable in the range of possible values. Thus, we encode sensor failures by hand, using the absolute_humidity variable (since they are seemingly most identifiable there). Sensor failures are encoded as a simple indicator variable, sensor_failure.
# Adds a sensor-failure indicator (factor 0/1).
# The failure windows were identified by hand from the absolute_humidity
# series; both windows share the same humidity threshold, so it is
# factored out of the condition (boolean-equivalent to the original
# two-clause form).
addSensorFailure <- function(.data) {
  .data %>%
    dplyr::mutate(
      sensor_failure = as.factor(
        ifelse(
          absolute_humidity < 0.24 &
            # window 1: before November ("real" extremes start then);
            # window 2: 12/14 through 2/12 (hand-picked epoch seconds)
            (time_const < 1289988000 |
               (time_const >= 1292346000 & time_const <= 1297551600)),
          1, 0
        )
      )
    )
}
train <- lapply(train, addSensorFailure)
test <- addSensorFailure(test)
all_data <- addSensorFailure(all_data)
We can check whether it was encoded correctly:
# Predictor time series colored by the new sensor_failure flag, used to
# verify the hand-encoded failure windows visually.
plotSensorFail <- function(.data, variable) {
  ggplot2::ggplot(.data, aes(x = date_time, y = variable,
                             color = sensor_failure)) +
    geom_line() +
    theme_minimal()
}
# Preallocate; slot 1 stays empty so list indices match column numbers.
failure_check <- vector("list", 9)
for (col in 2:9) {
  check_plot <- plotSensorFail(all_data, all_data[, col]) +
    labs(title = paste("Check Sensor Failure for", variables[col]))
  failure_check[[col]] <- plotly::ggplotly(check_plot)
}
# Failure-window overlays for each predictor (columns 2..9).
failure_check[[2]]
failure_check[[3]]
failure_check[[4]]
failure_check[[5]]
failure_check[[6]]
failure_check[[7]]
failure_check[[8]]
failure_check[[9]]
# The three outcome-check plots were identical except for the data and
# title, so a small helper removes the triplication.
# Plots one outcome over time, colored by the sensor_failure flag.
plotOutcomeCheck <- function(.data, title) {
  ggplot2::ggplot(.data, aes(x = date_time,
                             y = outcome,
                             color = sensor_failure)) +
    geom_line() +
    theme_minimal() +
    labs(title = title)
}
check_benzene <- plotOutcomeCheck(train$benzene, "Check Benzene")
check_benzene %>% plotly::ggplotly()
check_carbon <- plotOutcomeCheck(train$carbon, "Check Carbon")
check_carbon %>% plotly::ggplotly()
check_nitrogen <- plotOutcomeCheck(train$nitrogen, "Check Nitrogen")
check_nitrogen %>% plotly::ggplotly()
As observed in the graphs, not only are our sensor failures encoded correctly, but our previous intuition seems to hold true: sensor failures seem associated with predictable differences only for benzene outcomes, and not for the carbon/nitrogen outcomes. Thus, rather than include another variable, we can instead enforce a simple rule for the benzene outcome: if the sensor fails, set the benzene outcome to 0.1.
Otherwise, we see that sensor failures have varying effects on each of the predictor variables. This ranges from very minute blips (such as in sensor_1) to very high inaccuracies (as in sensor_2 and sensor_4). Thus, while sensor failures do not affect the carbon and nitrogen outcomes directly, our model might still be influenced by these failures due to their effects on the predictor variables.
Luckily, because we are working with time series data, we can impute our data with some accuracy by replicating a comparable time. We do so for the testing data using naive forecasting; for the training data, we simply drop observations with sensor failures.
# Drop training observations flagged as sensor failures.
train = lapply(train, dplyr::filter, sensor_failure == "0")
# Drop one row (index 7112) from the combined train+test frame.
# NOTE(review): these indices are hard-coded against the current data --
# presumably 7112 is the lone flagged row inside the kept span and 9358
# is nrow(all_data); verify if the data ever changes.
all_data = all_data[c(1:7111, 7113:9358), ]
# imputing for period 1
# Seasonal-naive imputation for the first failure window in the test
# data: model a hand-picked slice of clean rows just before the gap and
# forecast 2.2 days ahead (52 hourly points).
# NOTE(review): the row range (7132:7155), horizon, and output length
# are hard-coded to this dataset; Impute1/2/3 differ only in these
# constants and could be one parameterized function.
# NOTE(review): as_tsibble() is defined in the tsibble package --
# `fable::as_tsibble` relies on fable re-exporting it; confirm against
# the installed fable version.
Impute1 = function(whole_data, var_name, dates) {
  temp = whole_data[7132:7155, ]
  week_before_1 = temp %>%
    fable::as_tsibble(index = date_time) %>%
    fabletools::model(snaive = SNAIVE(as.formula(paste0(var_name, "~ lag(\"day\")")))) %>%
    fabletools::forecast(h = "2.2 days")
  # keep the date column only for the first variable imputed;
  # columns 2 and 4 are the index and point forecast of the fable
  if(dates) {
    week_before_1 = week_before_1[, c(2, 4)]
  }
  else {
    week_before_1 = week_before_1[, 4]
  }
  return(week_before_1[1:52, ])
}
# first call keeps the dates; remaining predictors are column-bound on
imputed1 = Impute1(all_data, variables[2], TRUE)
for(i in 3:9) {
  imputed1 = cbind(imputed1, Impute1(all_data, variables[i], FALSE))
}
colnames(imputed1) = variables[1:9]
# imputing for period 2
# Same seasonal-naive scheme as Impute1 for the second failure window:
# clean rows 7751:7775, 0.5-day horizon, 9 hourly points kept.
# NOTE(review): hard-coded constants; see the Impute1 notes about
# consolidating the three functions and about `fable::as_tsibble`.
Impute2 = function(whole_data, var_name, dates) {
  temp = whole_data[7751:7775, ]
  week_before_2 = temp %>%
    fable::as_tsibble(index = date_time) %>%
    fabletools::model(snaive = SNAIVE(as.formula(paste0(var_name, "~ lag(\"day\")")))) %>%
    fabletools::forecast(h = "0.5 day")
  # keep the date column only for the first variable imputed
  if(dates) {
    week_before_2 = week_before_2[, c(2, 4)]
  }
  else {
    week_before_2 = week_before_2[, 4]
  }
  return(week_before_2[1:9, ])
}
imputed2 = Impute2(all_data, variables[2], TRUE)
for(i in 3:9) {
  imputed2 = cbind(imputed2, Impute2(all_data, variables[i], FALSE))
}
colnames(imputed2) = variables[1:9]
# imputing for period 3
# Same scheme for the third failure window: clean rows 7980:8039, a
# 2-day seasonal lag, 3.2-day horizon, 76 hourly points kept.
# NOTE(review): hard-coded constants; see the Impute1 notes about
# consolidating the three functions and about `fable::as_tsibble`.
Impute3 = function(whole_data, var_name, dates) {
  temp = whole_data[7980:8039, ]
  week_before_3 = temp %>%
    fable::as_tsibble(index = date_time) %>%
    fabletools::model(snaive = SNAIVE(as.formula(paste0(var_name, "~ lag(\"2 days\")")))) %>%
    fabletools::forecast(h = "3.2 days")
  # keep the date column only for the first variable imputed
  if(dates) {
    week_before_3 = week_before_3[, c(2, 4)]
  }
  else {
    week_before_3 = week_before_3[, 4]
  }
  return(week_before_3[1:76, ])
}
imputed3 = Impute3(all_data, variables[2], TRUE)
for(i in 3:9) {
  imputed3 = cbind(imputed3, Impute3(all_data, variables[i], FALSE))
}
colnames(imputed3) = variables[1:9]
# directly add to dataframe
# NOTE(review): test row ranges 46:97, 666:674, and 930:1005 are
# hard-coded to the three failure windows and match the forecast
# lengths (52, 9, 76); verify alignment if the data ever changes.
test[46:97, 1:9] = imputed1
test[666:674, 1:9] = imputed2
test[930:1005, 1:9] = imputed3
# Random train/test split per outcome, stratified on the outcome value.
train_splits <- lapply(train, rsample::initial_split, strata = outcome)

# XGBoost regression spec with four tunable hyperparameters.
boost <- parsnip::boost_tree(
  trees = tune::tune(),
  tree_depth = tune::tune(),
  learn_rate = tune::tune(),
  sample_size = tune::tune()
) %>%
  parsnip::set_engine("xgboost") %>%
  parsnip::set_mode("regression")
# Shared preprocessing for every model: drop non-model columns,
# normalize numeric predictors, dummy-encode the categorical features.
CreateRecipe <- function(.data) {
  recipes::recipe(outcome ~ ., data = .data) %>%
    recipes::step_rm(date_time, temp_date, sensor_failure) %>%
    recipes::step_normalize(recipes::all_numeric_predictors()) %>%
    recipes::step_dummy(time, quarter, weekend)
}
# NOTE: the object `recipes` shadows the recipes package name; it is
# only used as data below, and namespaced calls still resolve.
recipes <- lapply(train, CreateRecipe)
# Bundle a preprocessing recipe and a model spec into one workflow.
CreateWorkflow <- function(recipe, model) {
  workflows::workflow() %>%
    workflows::add_model(model) %>%
    workflows::add_recipe(recipe)
}
boost_workflows <- lapply(recipes, CreateWorkflow, model = boost)
# examine for correct specification
boost_workflows$carbon
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## trees = tune::tune()
## tree_depth = tune::tune()
## learn_rate = tune::tune()
## sample_size = tune::tune()
##
## Computational engine: xgboost
To create a training-testing split, we do a random split. While time splits are often favored in time series analysis, the lack of data during the Winter towards the end of the training period may cause some problems during the split, especially since a majority of the data in the testing set is from the Winter. Due to its well-known success in regression models, we begin by modeling with a simple boosted tree model. We also conduct basic feature engineering using the recipes package, removing unused variables, normalizing all numeric predictors, and creating dummy variables for time and season.
# Rolling-origin resampling on the training half of each split:
# 144-hour analysis windows, 24-hour assessment windows, 672 hours
# skipped between origins, non-cumulative.
CrossValidation <- function(split) {
  split %>%
    rsample::training() %>%
    rsample::rolling_origin(cumulative = FALSE,
                            initial = 144,
                            assess = 24,
                            skip = 672)
}
cross <- lapply(train_splits, CrossValidation)
# quick sanity check on the first split object
nrow(train_splits[[1]])
# tuning: a regular grid over the boosted-tree parameter set
boost_tuning_grid <- dials::grid_regular(dials::parameters(boost),
                                         levels = 8)
# one tuning result per outcome-specific workflow
boost_tune_model <- lapply(seq_along(boost_workflows), function(idx) {
  tune::tune_grid(
    boost_workflows[[idx]],
    resamples = cross[[idx]],
    grid = boost_tuning_grid,
    control = control_grid(verbose = TRUE)
  )
})
Since we are analyzing time series data, we use rolling origin resampling to validate results. Thus, we begin basic hyperparameter tuning for our boosted tree model, tuning for learning rate, sample size, tree depth, and number of trees. Once hyperparameters are gathered, we observe and select the best hyperparameters:
# Tuning diagnostics for each outcome model.
boost_tune_model[[1]] %>%
  tune::autoplot() + theme_minimal()
boost_tune_model[[2]] %>%
  tune::autoplot() + theme_minimal()
boost_tune_model[[3]] %>%
  tune::autoplot() + theme_minimal()
# Keep the RMSE-best hyperparameter set per model.
boost_best = NULL # store best hyperparameters
for (i in 1:3) {
  boost_best[[i]] = tune::select_best(boost_tune_model[[i]], metric = "rmse")
}
# see hyperparameters
boost_best[[1]]
## # A tibble: 1 x 5
## trees tree_depth learn_rate sample_size .config
## <int> <int> <dbl> <dbl> <chr>
## 1 1714 1 0.00518 0.486 Preprocessor1_Model0415
boost_best[[2]]
## # A tibble: 1 x 5
## trees tree_depth learn_rate sample_size .config
## <int> <int> <dbl> <dbl> <chr>
## 1 2000 3 0.00518 0.486 Preprocessor1_Model0928
boost_best[[3]]
## # A tibble: 1 x 5
## trees tree_depth learn_rate sample_size .config
## <int> <int> <dbl> <dbl> <chr>
## 1 572 7 0.1 0.357 Preprocessor1_Model2003
Based on the results of the tuning above, we find our ideal hyperparameters for the boosted model above and include those results. Finally, we create our new models, training using the full training dataset, and we evaluate model performance from the train_test split as well as determine variable importance.
# Inject a set of selected hyperparameters into a tunable workflow.
finalizeWork <- function(workflow, best) {
  tune::finalize_workflow(workflow, best)
}
# Map is mapply(..., SIMPLIFY = FALSE): one finalized workflow per
# (workflow, best) pair, keeping the outcome names.
boost_final_workflows <- Map(finalizeWork, boost_workflows, boost_best)
# check workflow model
boost_final_workflows[[1]]
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## trees = 1714
## tree_depth = 1
## learn_rate = 0.00517947467923122
## sample_size = 0.485714285714286
##
## Computational engine: xgboost
# Fit the .index-th finalized workflow on its training split and
# evaluate once on the held-out portion (tune::last_fit).
checkPerf <- function(.index, final, split) {
  tune::last_fit(final[[.index]], split = split[[.index]])
}
# creates plots for variable importance from a last_fit result
# NOTE(review): workflows::pull_workflow_fit() is the soft-deprecated
# predecessor of extract_fit_parsnip(); it still works with the
# workflows version used here -- confirm before upgrading packages.
evalVar = function(workflow) {
  return(
    workflow %>%
      purrr::pluck(".workflow", 1) %>%
      workflows::pull_workflow_fit() %>%
      vip::vip(num_features = 35) +
      theme_minimal() #+
    #ylim(0, 0.1)
  )
}
# create last fits, one per outcome model
boost_perf1 = checkPerf(1, boost_final_workflows, train_splits)
boost_perf2 = checkPerf(2, boost_final_workflows, train_splits)
boost_perf3 = checkPerf(3, boost_final_workflows, train_splits)
boost_perf = list(boost_perf1, boost_perf2, boost_perf3)
# check model performance (held-out rmse and rsq per outcome)
lapply(boost_perf, collect_metrics)
## [[1]]
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.443 Preprocessor1_Model1
## 2 rsq standard 0.906 Preprocessor1_Model1
##
## [[2]]
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 1.20 Preprocessor1_Model1
## 2 rsq standard 0.975 Preprocessor1_Model1
##
## [[3]]
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 43.9 Preprocessor1_Model1
## 2 rsq standard 0.943 Preprocessor1_Model1
lapply(boost_perf, evalVar)
## [[1]]
##
## [[2]]
##
## [[3]]
# Fit one finalized workflow on the full training data for its outcome.
fitModel <- function(final, training) {
  fit(final, data = training)
}
# create fitted models, one per outcome (names preserved by Map)
boost_fitted_models <- Map(fitModel, boost_final_workflows, train)
# re-print the finalized workflow for reference (unchanged by fitting)
boost_final_workflows[[1]]
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## trees = 1714
## tree_depth = 1
## learn_rate = 0.00517947467923122
## sample_size = 0.485714285714286
##
## Computational engine: xgboost
We see that while model 2 performs quite well in terms of rmse and rsq, model 1 and model 3 are less accurate.
We then apply the model:
# predict using testing data
boost_pred_carbon <- boost_fitted_models[[1]] %>%
  predict(new_data = test)
boost_pred_benzene <- boost_fitted_models[[2]] %>%
  predict(new_data = test)
boost_pred_nitrogen <- boost_fitted_models[[3]] %>%
  predict(new_data = test)
# FIX: the original as.data.frame(cbind(...)) went through a character
# matrix, silently coercing every numeric prediction to character.
# Building the frame with data.frame() keeps the targets numeric; the
# CSV written below is equivalent and is re-read with typed columns later.
boost_solution <- data.frame(
  date_time = test_id,
  target_carbon_monoxide = boost_pred_carbon$.pred,
  target_benzene = boost_pred_benzene$.pred,
  target_nitrogen_oxides = boost_pred_nitrogen$.pred
)
readr::write_csv(boost_solution, "boost.csv")
We note that for observations with broken sensor values, we will set the predicted benzene outcome to \(0.1\).
Time series data are inherently very noisy, so a model robust to outliers and noise would be ideal for fitting. One method of addressing this is to ensemble our boosted tree model (which is particularly prone to outliers) with a Random Forest model, which is less prone to these errors.
# str(train_splits) # train-test splits established prior
# specify ranger model
# NOTE(review): the mtry expression finalizes a parameter range and
# then pipes the parameter object into tune(). parsnip stores the
# expression unevaluated, and because an explicit grid is supplied to
# tune_grid() below, the finalize() part likely never takes effect --
# confirm intent; `mtry = tune::tune()` alone may be equivalent here.
rf = parsnip::rand_forest(
  mtry = dials::finalize(mtry(), test[, -1]) %>% tune::tune(),
  trees = tune::tune(),
  min_n = tune::tune()
) %>%
  parsnip::set_engine("ranger") %>%
  parsnip::set_mode("regression")
# str(recipes) # same preprocessor established prior
# create workflows: same per-outcome recipes, random-forest spec
rf_workflows = lapply(recipes, CreateWorkflow, model = rf)
# check specification
rf_workflows$carbon
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = dials::finalize(mtry(), test[, -1]) %>% tune::tune()
## trees = tune::tune()
## min_n = tune::tune()
##
## Computational engine: ranger
In a similar fashion to before, we use rolling origin resampling to tune hyperparameters.
# str(cross) # same specification as before
# tuning: regular grid with mtry restricted to [4, 12]
rf_tuning_grid <- dials::grid_regular(mtry() %>% range_set(c(4, 12)),
                                      min_n(),
                                      trees(),
                                      levels = 8)
# one tuning result per outcome-specific workflow
rf_tune_model <- lapply(seq_along(rf_workflows), function(idx) {
  tune::tune_grid(
    rf_workflows[[idx]],
    resamples = cross[[idx]],
    grid = rf_tuning_grid,
    control = control_grid(verbose = TRUE)
  )
})
We can then visualize and select the best hyperparameters:
# Tuning diagnostics for each outcome model.
rf_tune_model[[1]] %>%
  tune::autoplot() + theme_minimal()
rf_tune_model[[2]] %>%
  tune::autoplot() + theme_minimal()
rf_tune_model[[3]] %>%
  tune::autoplot() + theme_minimal()
# Keep the RMSE-best hyperparameter set per model.
rf_best = NULL # store best hyperparameters
for (i in 1:3) {
  rf_best[[i]] = tune::select_best(rf_tune_model[[i]], metric = "rmse")
}
# see hyperparameters
rf_best[[1]]
## # A tibble: 1 x 4
## mtry trees min_n .config
## <int> <int> <int> <chr>
## 1 8 286 7 Preprocessor1_Model077
rf_best[[2]]
## # A tibble: 1 x 4
## mtry trees min_n .config
## <int> <int> <int> <chr>
## 1 12 857 2 Preprocessor1_Model200
rf_best[[3]]
## # A tibble: 1 x 4
## mtry trees min_n .config
## <int> <int> <int> <chr>
## 1 12 286 7 Preprocessor1_Model080
# Finalize each random-forest workflow with its best hyperparameters
# (Map is mapply(..., SIMPLIFY = FALSE) and preserves outcome names).
rf_final_workflows <- Map(finalizeWork, rf_workflows, rf_best)
# check workflow model
rf_final_workflows[[1]]
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 8
## trees = 286
## min_n = 7
##
## Computational engine: ranger
# create last fits, one per outcome model
rf_perf1 = checkPerf(1, rf_final_workflows, train_splits)
rf_perf2 = checkPerf(2, rf_final_workflows, train_splits)
rf_perf3 = checkPerf(3, rf_final_workflows, train_splits)
rf_perf = list(rf_perf1, rf_perf2, rf_perf3)
# check model performance (held-out rmse and rsq per outcome)
lapply(rf_perf, collect_metrics)
## [[1]]
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.374 Preprocessor1_Model1
## 2 rsq standard 0.932 Preprocessor1_Model1
##
## [[2]]
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 1.12 Preprocessor1_Model1
## 2 rsq standard 0.978 Preprocessor1_Model1
##
## [[3]]
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 46.3 Preprocessor1_Model1
## 2 rsq standard 0.937 Preprocessor1_Model1
# Fit each finalized random-forest workflow on the full training data
# (names preserved by Map).
rf_fitted_models <- Map(fitModel, rf_final_workflows, train)
# re-print the finalized workflow for reference
rf_final_workflows[[1]]
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 8
## trees = 286
## min_n = 7
##
## Computational engine: ranger
Noticeably, the random forest model performs comparably or even better than the boosted tree in terms of rmse and rsq. We thus gather our predictions:
# predict using testing data
rf_pred_carbon <- rf_fitted_models[[1]] %>%
  predict(new_data = test)
rf_pred_benzene <- rf_fitted_models[[2]] %>%
  predict(new_data = test)
rf_pred_nitrogen <- rf_fitted_models[[3]] %>%
  predict(new_data = test)
# FIX: build with data.frame() instead of as.data.frame(cbind(...)),
# which coerced every prediction to character via a character matrix.
# Downstream as.numeric() calls on these columns remain valid -- they
# are no-ops on numeric input.
rf_solution <- data.frame(
  date_time = test_id,
  target_carbon_monoxide = rf_pred_carbon$.pred,
  target_benzene = rf_pred_benzene$.pred,
  target_nitrogen_oxides = rf_pred_nitrogen$.pred
)
readr::write_csv(rf_solution, "rf.csv")
and we can now ensemble by simply averaging the two solutions (as well as imposing the 0.1 benzene rule discussed previously):
# Re-read the boosted-tree predictions; readr types the columns on read.
boost_solution = read_csv("boost.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## date_time = col_datetime(format = ""),
## target_carbon_monoxide = col_double(),
## target_benzene = col_double(),
## target_nitrogen_oxides = col_double()
## )
# Prefix the rf columns so the two solution frames can be column-bound.
rf_solution = rf_solution %>%
  dplyr::rename(rf_date = "date_time",
                rf_carbon = "target_carbon_monoxide",
                rf_benzene = "target_benzene",
                rf_nit = "target_nitrogen_oxides")
# Ensemble by simple averaging, then impose the 0.1 benzene rule for
# test rows flagged as sensor failures. as.numeric() keeps this robust
# whether the rf columns arrived as character or numeric.
# FIX: removed a redundant second as.character(date_time) mutate at the
# end of the original chain -- date_time is already character after the
# first mutate below.
full_soln = cbind(boost_solution, rf_solution) %>%
  dplyr::mutate(date_time = as.character(date_time),
                target_carbon_monoxide = (target_carbon_monoxide + as.numeric(rf_carbon)) / 2,
                target_benzene = (target_benzene + as.numeric(rf_benzene)) / 2,
                target_nitrogen_oxides = (target_nitrogen_oxides + as.numeric(rf_nit)) / 2) %>%
  # keep the four submission columns (drops the rf_* copies)
  dplyr::select(c(1:4)) %>%
  cbind(test %>% select("sensor_failure")) %>%
  # sensor_failure is a factor with levels "0"/"1"; == 1 matches "1"
  dplyr::mutate(target_benzene = ifelse(sensor_failure == 1, 0.1, target_benzene)) %>%
  dplyr::select(c(1:4))
readr::write_csv(full_soln, "full.csv")